int reorg_cuda(THCudaTensor *x_tensor, int w, int h, int c, int batch, int stride, int forward, THCudaTensor *out_tensor);